{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMwuvFjUcDJN0VQn5KaTqqo",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/tomasonjo/blogs/blob/master/h20_llm/LLM_train_dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CpubjRJQ4uuY",
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "!pip install openai"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import json\n",
        "import pandas as pd\n",
        "import openai\n",
        "\n",
        "openai.api_key = \"OPENAI_KEY\""
      ],
      "metadata": {
        "id": "I3F2ILJT4vV2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "examples = \"\"\"\n",
        "# Who played in Top Gun?\n",
        "MATCH (m:Movie)<-[r:ACTED_IN]-(a)\n",
        "RETURN {actor: a.name, role: r.role} AS result\n",
        "# What is the plot of the Copycat movie?\n",
        "MATCH (m:Movie {title: \"Copycat\"})\n",
        "RETURN {plot: m.plot} AS result\n",
        "# Did Luis Guzmán appear in any other movies?\n",
        "MATCH (p:Person {name:\"Luis Guzmán\"})-[r:ACTED_IN]->(movie)\n",
        "RETURN {movie: movie.title, role: r.role} AS result\n",
        "# Do you know of any matrix movies?\n",
        "MATCH (m:Movie)\n",
        "WHERE toLower(m.title) CONTAINS toLower(\"matrix\")\n",
        "RETURN {movie:m.title} AS result\n",
        "# How many reviews does each Matrix movie have?\n",
        "MATCH (m:Movie)<-[:RATED]-(u:User)\n",
        "WHERE m.title CONTAINS 'Matrix'\n",
        "WITH m, count(*) AS reviews\n",
        "RETURN m.title AS movie, reviews\n",
        "ORDER BY reviews DESC LIMIT 5;\n",
        "# Recommend me a similar movie to Crimson Tide\n",
        "MATCH (m:Movie {title: 'Crimson Tide'})<-[:RATED]-\n",
        "      (u:User)-[:RATED]->(rec:Movie)\n",
        "WITH rec, COUNT(*) AS usersWhoAlsoWatched\n",
        "ORDER BY usersWhoAlsoWatched DESC LIMIT 25\n",
        "RETURN rec.title AS recommendation, usersWhoAlsoWatched\n",
        "# Find me a good comedy?\n",
        "MATCH (m:Movie)-[:IN_GENRE]->(:Genre {name:\"Comedy\"})\n",
        "RETURN {movie: m.title} AS result\n",
        "ORDER BY m.imdbRating DESC LIMIT 1\n",
        "# When was Copycat released?\n",
        "MATCH (m:Movie {title:\"CopyCat\"})\n",
        "RETURN {year: m.year} AS result\n",
        "\"\"\""
      ],
      "metadata": {
        "id": "EaXbgtB65YhF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "system = f\"\"\"\n",
        "You are an assistant that has only one task.\n",
        "You need to generate 100 Cypher query examples based on the movie dataset.\n",
        "Do not respond with any explanations and do not apologize.\n",
        "Here are some query examples:\n",
        "{examples}\n",
        "Respond with format where each line represents one example:\n",
        "{{\"instruction\": \"Who played in Top Gun?\", 'output': \"MATCH (m:Movie)<-[r:ACTED_IN]-(a) RETURN {{actor: a.name, role: r.role}} AS result\"}}\n",
        "Do not return any examples that cannot be inferred from provided queries, so no new node labels of relationship types.\n",
        "Do not include examples I have provided and do not use Matrix in the examples.\n",
        "\"\"\""
      ],
      "metadata": {
        "id": "ij-hg_Cy400q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def parse_response(text):\n",
        "  # Split the response string by newline characters to get individual JSON strings\n",
        "  response_list = text.split('\\n')\n",
        "\n",
        "  # Parse each JSON string as a dictionary and append it to a list\n",
        "  parsed_list = []\n",
        "  for r in response_list:\n",
        "      if r.strip() != '':\n",
        "        try:\n",
        "          parsed_list.append(json.loads(r))\n",
        "        except:\n",
        "          pass\n",
        "\n",
        "  return parsed_list"
      ],
      "metadata": {
        "id": "7nMjgeEdEHXY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# GPT-4 has a output token limit around 1100 tokens\n",
        "# So we do 20 requests and increase the temperature to\n",
        "# encourage various Cypher examples\n",
        "\n",
        "training_data = []\n",
        "for i in range(20):\n",
        "    print(f\"Create {i} batch of examples\")\n",
        "    completions = openai.ChatCompletion.create(\n",
        "        model=\"gpt-4\",\n",
        "        temperature=0.6,\n",
        "        max_tokens=6000,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": system},\n",
        "            {\"role\": \"user\", \"content\": \"Generate 10 examples\"},\n",
        "        ],\n",
        "    )\n",
        "    response = completions.choices[0].message.content\n",
        "    training_data.extend(parse_response(response))\n",
        "\n",
        "df = pd.DataFrame.from_records(training_data)\n",
        "df[\"instruction\"] = [\n",
        "    \"Create a Cypher statement to answer the following question: \" + el\n",
        "    for el in df[\"instruction\"]\n",
        "]\n",
        "df.to_csv(\"train.csv\", index=False)"
      ],
      "metadata": {
        "id": "_ZjxpSET4zf5",
        "collapsed": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#from google.colab import files\n",
        "#files.download('train.csv') "
      ],
      "metadata": {
        "id": "NneGopuw520f"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "val_data = []\n",
        "for i in range(4):\n",
        "    print(f\"Create {i} batch of examples\")\n",
        "    completions = openai.ChatCompletion.create(\n",
        "        model=\"gpt-4\",\n",
        "        temperature=0.6,\n",
        "        max_tokens=6000,\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": system},\n",
        "            {\"role\": \"user\", \"content\": \"Generate 10 examples\"},\n",
        "        ],\n",
        "    )\n",
        "    response = completions.choices[0].message.content\n",
        "    val_data.extend(parse_response(response))\n",
        "\n",
        "df = pd.DataFrame.from_records(val_data)\n",
        "df[\"instruction\"] = [\n",
        "    \"Create a Cypher statement to answer the following question: \" + el\n",
        "    for el in df[\"instruction\"]\n",
        "]\n",
        "df.to_csv(\"val.csv\", index=False)"
      ],
      "metadata": {
        "id": "yPjuvPGs86KN",
        "collapsed": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#from google.colab import files\n",
        "#files.download('vald.csv') "
      ],
      "metadata": {
        "id": "E_wDaMQFTOmu"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}